Geospatial Data
#------------------------read suburb data---------------------------------------
# load the locality (suburb) boundary polygons from the shapefile
locs <- readOGR(
  dsn = 'other_data/VIC_LOCALITY_POLYGON_shp.shp',
  stringsAsFactors = FALSE,
  verbose = FALSE,
  GDAL1_integer64_policy = TRUE
)
# VIC_LOCA_2 holds the suburb names: lower-case them, then apply capitalize(),
# so they are spelled the same way as the suburbs in the analysis data
lower_case_names <- tolower(locs$VIC_LOCA_2)
locs$VIC_LOCA_2 <- capitalize(lower_case_names)
#--------------------aggregate data used in analysis----------------------------
# one row per suburb: price median and quartiles, mean coordinates,
# number of rows (count0) and number of rows with a known price (count)
suburb_data <- property_data %>%
  group_by(suburb) %>%
  summarise(
    median_price = median(price, na.rm = TRUE),
    q25 = quantile(price, 0.25, na.rm = TRUE),
    q75 = quantile(price, 0.75, na.rm = TRUE),
    mean_lng = mean(lng, na.rm = TRUE),
    mean_lat = mean(lat, na.rm = TRUE),
    count0 = n(),
    count = sum(!is.na(price))
  )
#---------------------------combine suburb and analysis data--------------------
# Keep only the polygons whose suburb actually occurs in suburb_data.
# The observed values are compared rather than levels(): levels() is NULL for
# a character column (so nothing would match) and, for a factor, it also lists
# levels that have no row in suburb_data, which would keep polygons without
# any data to show.
locs_subset <- locs[locs$VIC_LOCA_2 %in% as.character(suburb_data$suburb), ]
# attach the per-suburb summary columns to the matching polygons
merged <- merge(locs_subset, suburb_data, by.x = 'VIC_LOCA_2', by.y = 'suburb')
#----------------------------setup interactive text-----------------------------
# Format numbers for display in the map labels.
#   number: numeric vector
#   returns: character vector, one element per input, rounded to whole units,
#            with ',' as thousands separator and never in scientific notation
prettify <- function(number) {
  format(
    round(number, 0),
    # argument name written in full instead of relying on partial matching
    big.mark = ',',
    scientific = FALSE,
    # without trim, format() pads every element to the width of the longest
    # one, which puts spaces between the '$' and the digits in the labels
    trim = TRUE
  )
}
# HTML hover label per suburb: name, median price, quartiles and sale count,
# each on its own line (lines are separated with <br>)
merged$string <- paste0(
  merged$VIC_LOCA_2,
  paste0('<br>median price: $', prettify(merged$median_price)),
  paste0('<br>25th%: $', prettify(merged$q25)),
  paste0('<br>75th%: $', prettify(merged$q75)),
  paste0('<br>count: ', merged$count)
)
#----------------------------setup colours--------------------------------------
# colour suburbs by the percentile rank of their median price rather than by
# the raw price
merged$median_perc <- percent_rank(merged$median_price)
pal <- colorNumeric(
  palette = 'Blues',
  domain = range(merged$median_perc, na.rm = TRUE)
)
#--------------------------------plot-------------------------------------------
# centre the initial view on the average of the per-suburb mean coordinates
lngview <- mean(merged$mean_lng, na.rm = TRUE)
latview <- mean(merged$mean_lat, na.rm = TRUE)
# suburb polygons filled by price percentile, with the HTML label on hover
leaflet(merged) %>%
  addTiles() %>%
  setView(lng = lngview, lat = latview, zoom = 9) %>%
  addPolygons(
    weight = 1,
    fillOpacity = 1,
    fillColor = ~pal(median_perc),
    label = ~lapply(string, HTML)
  )
#----------------------lnglat
# draw every property as a semi-transparent circle at its coordinates
property_data %>%
  leaflet() %>%
  addTiles() %>%
  addCircles(
    lng = ~lng,
    lat = ~lat,
    color = '#0078D7',
    fillColor = '#0078D7',
    opacity = 0.5,
    fillOpacity = 0.5
  )
# very few rows lack lng or lat, so they are dropped instead of imputed:
# TRUE only where both coordinates are present
loc_bool <- !(is.na(property_data$lng) | is.na(property_data$lat))
Univariate EDA
#----------------------year_built
# 1 property was apparently built in 1196 and another next century;
# blank out any construction year before 1788 or after 2030
to_correct <- with(property_data, which(year_built < 1788 | year_built > 2030))
property_data[to_correct, 'year_built'] <- NA
# TRUE for rows that still have a construction year
year_built_bool <- !is.na(property_data$year_built)
#--------------------ncar
# log(price) against number of car parks, jittered to reduce overplotting
ggplot(property_data, mapping = aes(x = ncar, y = log(price))) +
  geom_jitter(color = '#0078D7', alpha = 0.4)

# limit the scope of analysis to properties with a known ncar of at most 6
ncar_bool <- with(property_data, !is.na(ncar) & ncar <= 6)
#--------------------nbathroom
# log(price) against number of bathrooms, jittered to reduce overplotting
ggplot(data = property_data, mapping = aes(x = nbathroom, y = log(price))) +
  geom_jitter(color = '#0078D7', alpha = 0.4)

# limit the analysis to properties with a known nbathroom between 1 and 4
bath_bool <- with(
  property_data,
  !is.na(nbathroom) & nbathroom > 0 & nbathroom <= 4
)
#---------------------nrooms
# log(price) against number of rooms, jittered to reduce overplotting
ggplot(property_data, aes(x = nrooms, y = log(price))) +
  geom_jitter(alpha = 0.4, color = '#0078D7')

# limit the analysis to properties with 1 to 6 rooms.
# A missing nrooms has to give FALSE, not NA: an NA in a logical row mask
# selects an all-NA row when the mask is used to subset property_data.
nroom_bool <- !is.na(property_data$nrooms) &
  property_data$nrooms <= 6 & property_data$nrooms > 0
#---------------------nbedroom
# number of rows with no bedroom count
sum(is.na(property_data$nbedroom))
## [1] 8217
# correlation of nbedroom with nrooms over the rows where nbedroom is known
has_bedroom <- !is.na(property_data$nbedroom)
with(property_data[has_bedroom, ], cor(nbedroom, nrooms))
## [1] 0.9467546
# often missing and highly correlated with nrooms: don't include in analysis
#----------------------building_area
# log(price) against building area
ggplot(property_data, mapping = aes(x = building_area, y = log(price))) +
  geom_point(color = '#0078D7', alpha = 0.4)

# a building_area of exactly 0 is treated as missing
# (%in% is FALSE for NA, so values that are already NA are left alone)
property_data$building_area[property_data$building_area %in% 0] <- NA
# limit analysis to buildings with a known building_area below 1000
BA_bool <- with(property_data, !is.na(building_area) & building_area < 1000)
#----------------------land_area
# log(price) against land area
ggplot(property_data, mapping = aes(x = land_area, y = log(price))) +
  geom_point(color = '#0078D7', alpha = 0.4)

# limit analysis to properties with a known land_area below 10000
land_bool <- with(property_data, !is.na(land_area) & land_area < 10000)
#---------------------actions
# each *_bool marks the rows that pass one filter; a row is kept only when it
# passes all of them (element-wise AND over the whole list)
property_data <- property_data[
  Reduce(
    `&`,
    list(
      loc_bool, year_built_bool, ncar_bool, bath_bool,
      nroom_bool, BA_bool, land_bool
    )
  ),
]
# replot the two area variables on the filtered data
ggplot(property_data, mapping = aes(x = building_area, y = log(price))) +
  geom_point(color = '#0078D7', alpha = 0.4)

ggplot(property_data, mapping = aes(x = land_area, y = log(price))) +
  geom_point(color = '#0078D7', alpha = 0.4)
